################################################
## Does peer review improve the statistical   ##
## content of manuscripts?                    ##
## A study on 27,467 manuscripts              ##
## Daniel Garcia-Costa, Anabel Forte-Deltell, ##
##                                            ##
## Daniel Garcia-Costa, Anabel Forte,         ##
## Emilia López-Iñesta,Flaminio Squazzoni     ##
## and Francisco Grimaldo                     ##
##                                            ##
## 2021                                       ##
################################################

## Requirements:
## Install last versión of JAGS in you computer https://sourceforge.net/projects/mcmc-jags/files/


##------Libraries-----

library(dplyr)
library(ggplot2)
library(hrbrthemes)
library(arm)
library(gridExtra)
library(reshape2)
library(ggridges)
library(rjags)
library(runjags)
library(scales)
library(car)
library(devtools)
library(xtable)
library(nortest)
library(BAS)
library(R2jags)


#set this folder as a working directory
setwd("/media/nas/peerobs_sync/shared/RSOS_Statistical_study/code/")
source("DBDA2E-utilities.R" )

#### REGRESSION MODELS #####

### The following models are performed over the papers for which we have at least with valid review with its text available. Data available in the csv file "data.csv"

data <- read.csv("data.csv")
data <- data %>% filter(category == 'peerReviewed' & !is.na(max_stats_rev))


#######################
## Poisson Regression##
#######################
##-------Data-----
datos_mod <-
  data  %>% transmute(
    questions,
    num_ref,
    max_stats_rev,
    y= final_terms,
    score_all,
    nrounds,
    initial_terms,
    Journal
  )

##-------Variable selection------

fit.bas <-
  bas.glm(
    formula = "y~.",
    data = datos_mod[, -8],
    family = poisson(),
    method = "MCMC+BAS",
    laplace = TRUE
  )


# Table 3
tabla <- data.frame(PIP = fit.bas$probne0)
row.names(tabla) <-
  c(
    "Intercept",
    "Journal Stat guidelines",
    "number of reviewers",
    "max number of stats terms in the reviews",
    "Review Score all rounds",
    "number of rounds",
    "Initial number of terms"
  )

xtable(tabla)

##-------JAGS Data ---------
jags.seed(1612170000)
N <- dim(datos_mod)[1]
#based in the previous model selection
X <-
  as.matrix(datos_mod  %>% dplyr::select(max_stats_rev, initial_terms, nrounds))
X.b <-
  as.matrix(datos_mod %>% dplyr::transmute(as.numeric(as.factor(Journal))))
p <- dim(X)[2]
forJags <- list(
  X = X,
  # predictors
  X.b = X.b,
  nb = length(unique(X.b)),
  y = datos_mod$y,
  # DV
  N = N,
  # sample size
  mu.beta = rep(0, p),
  # priors centered on 0
  tau.beta = diag(.0001, p)
)



##-------JAGS: Model compilation------

# jagsmodel <- jags.model(
#   file = "poisson.bug",
#   data = forJags,
#   n.chains = 1,
#   n.adapt = 1e6
# )
# 
# update(jagsmodel, n.iter = 1e4)

##-------JAGS: Update------

# out <-
#   coda.samples(
#     jagsmodel,
#     variable.names = c("beta0", "beta", "sigma", "b"),
#     n.iter = 300000,
#     thin = 100
#   )
# save(out, file = "poiss.RData")



## Our simulation is saved in the file: poiss.RData
load(file = "poiss.RData")





##------ JAGS: Results----

Intercept <- unlist(out[, "beta0"])
beta_stat_rev <- unlist(out[, "beta[1]"])
beta_initial <- unlist(out[, "beta[2]"])
beta_nrounds <- unlist(out[, "beta[3]"])
sigma <- unlist(out[,"sigma"])

##-------JAGS: Convergence cheking-----
plot(Intercept, type="l")
plot(beta_stat_rev, type="l")
plot(beta_initial, type="l")
plot(beta_nrounds, type="l")
plot(sigma, type="l")

#Figure 2

data_plot <- data.frame(initial = 0:28, m_final= NA, q1_final=NA, q3_final=NA)
for(i in 1:dim(data_plot)[1]){
  sims <- exp(Intercept + beta_stat_rev*5+ beta_initial*data_plot$initial[i]+ beta_nrounds*1)
  data_plot$m_final[i] <- median(sims)
  data_plot$q1_final[i] <- quantile(sims,0.025)
  data_plot$q3_final[i] <- quantile(sims, 0.975)
}


initial1<- ggplot(data_plot, aes(x=initial, y=m_final))+ geom_line() + geom_abline(slope = 1,intercept = 0,lty="dashed")+ geom_ribbon(data=data_plot, aes(ymin=q1_final, ymax=q3_final), alpha=0.3)+ylim(0,max(data_plot$q3_final)) +theme_ipsum() + xlab("") + ylab("")

data_plot <- data.frame(initial = 0:28, m_final= NA, q1_final=NA, q3_final=NA)
for(i in 1:dim(data_plot)[1]){
  sims <- exp(Intercept + beta_stat_rev*25+ beta_initial*data_plot$initial[i]+ beta_nrounds*1)
  data_plot$m_final[i] <- mean(sims)
  data_plot$q1_final[i] <- quantile(sims,0.025)
  data_plot$q3_final[i] <- quantile(sims, 0.975)
}


initial2<- ggplot(data_plot, aes(x=initial, y=m_final))+ geom_line() + geom_abline(slope = 1,intercept = 0,lty="dashed")+ geom_ribbon(data=data_plot, aes(ymin=q1_final, ymax=q3_final), alpha=0.3)+ylim(0,max(data_plot$q3_final)) +theme_ipsum() + xlab("") + ylab("")

p <- grid.arrange(initial1,initial2,nrow=1)

p
#Figure 3

#finalnumber of terms by maximum number of terms in the reports

data_plot <- data.frame(review = 0:20, m_final= NA, q1_final=NA, q3_final=NA)
for(i in 1:dim(data_plot)[1]){
  sims <- exp(Intercept + beta_stat_rev*data_plot$review[i]+ beta_initial*10+ beta_nrounds*2)
  data_plot$m_final[i] <- median(sims)
  data_plot$q1_final[i] <- quantile(sims,0.025)
  data_plot$q3_final[i] <- quantile(sims, 0.975)
}

ggplot(data_plot, aes(x=review, y=m_final))+ geom_line() + geom_ribbon(data=data_plot, aes(ymin=q1_final, ymax=q3_final), alpha=0.3) +theme_ipsum()+ xlab("") + ylab("")+ geom_hline(yintercept = 10, lty="dashed") + ylim(8,12)


#finalnumber of terms by number of rounds
data_plot <- data.frame(rounds = 1:6, m_final= NA, q1_final=NA, q3_final=NA)

for(i in 1:dim(data_plot)[1]){
  sims <- exp(Intercept + beta_stat_rev*mean(datos_mod$max_stats_rev)+ beta_initial*10 + beta_nrounds*data_plot$rounds[i])
  data_plot$m_final[i] <- median(sims)
  data_plot$q1_final[i] <- quantile(sims,0.025)
  data_plot$q3_final[i] <- quantile(sims, 0.975)
}


ggplot(data_plot, aes(x=rounds, y=m_final))+ geom_line() + geom_ribbon(data=data_plot, aes(ymin=q1_final, ymax=q3_final), alpha=0.3) +theme_ipsum()+ xlab("") +ylim(c(8,12))+ ylab("")+ geom_abline(slope = 0, intercept = 10, lty="dashed") 



#Figure 11
aux <- data.frame(sims = exp(beta_stat_rev))

p1<- ggplot(aux, aes(x=sims))+geom_density()+ xlab("")+ylab("")+theme_ipsum()+ geom_vline(xintercept = 1,lty="dashed")

aux <- data.frame(sims = exp(beta_initial))

p2<-ggplot(aux, aes(x=sims))+geom_density()+ xlab("")+ylab("")+theme_ipsum() + geom_vline(xintercept = 1,lty="dashed")

aux <- data.frame(sims = exp(beta_nrounds))

p3 <- ggplot(aux, aes(x=sims))+geom_density()+ xlab("")+ ylab("") + theme_ipsum()+ geom_vline(xintercept = 1,lty="dashed")

p <- grid.arrange(p1,p2,p3,nrow=2)

#Figure 12
data_bs <- data.frame(simsb1 = exp(unlist(out[,"b[1]"])), simsb7 = exp(unlist(out[,"b[2]"])),simsb8 = exp(unlist(out[,"b[3]"])),simsb11 = exp(unlist(out[,"b[4]"])))


ggplot(data_bs, aes(x=simsb1, color="Journal 1"))+stat_density(geom="line",position="identity")+ stat_density(aes(x=simsb7, color="Journal 7"),geom="line",position="identity")+stat_density(aes(x=simsb8, color="Journal 8"),geom="line",position="identity")+xlab("Journal effect")+stat_density(aes(x=simsb11, color="Journal 11"),geom="line",position="identity")+geom_vline(xintercept =1,lty="dashed")+theme_ipsum()+ labs(colour="")

########################
## Logistic Regression##
########################

##--------Data-------
datos_mod <-
  data  %>% transmute(
    questions,
    num_ref,
    max_stats_rev,
    y= accept,
    dif=final_terms-initial_terms,
    score_all,
    nrounds,
    initial_terms,
    Journal
  )

##--------Variable selection------
fit.bas <-
  bas.glm(
    formula = "y~.",
    data = datos_mod[, -9],
    family = binomial(),
    method = "MCMC+BAS",
    laplace = TRUE
  )

# Table 4
tabla <- data.frame(PIP = fit.bas$probne0)
row.names(tabla) <-
  c(
    "Intercept",
    "Journal Stat guidelines",
    "number of referees",
    "max number of stats terms in the reviews",
    "Difference in the number of terms",
    "Review Score all rounds",
    "number of rounds",
    "Initial number of terms"
  )
xtable(tabla)

##--------JAGS:Data------
jags.seed(1612170000)

N <- dim(datos_mod)[1]

#based in the previous model selection
X <-
  as.matrix(datos_mod %>% dplyr::select(num_ref, max_stats_rev, initial_terms, nrounds))
p <- dim(X)[2]
forJags <- list(
  X = X,
  # predictors
  accept = datos_mod$y,
  # DV
  N = N,
  # sample size
  mu.beta = rep(0, p),
  # priors centered on 0
  tau.beta = diag(.0001, p)
)



##--------JAGS: compilation------
# jagsmodel <-
#   jags.model(
#     file = "logistic.bug",
#     data = forJags,
#     n.chains = 1,
#     n.adapt = 5000
#   )
# 
# update(jagsmodel, n.iter = 1e4)

##--------JAGS: Update------
# out <-
#   coda.samples(
#     jagsmodel,
#     variable.names = c("beta0", "beta"),
#     n.iter = 300000,
#     thin = 100
#   )
save(out, file = "logistic.Rdata")

## Our simulation is saved in the file: logistic.RData

load(file = "logistic.Rdata")



##--------JAGS: Results------

Intercept <- unlist(out[, "beta0"])
beta_stat_rev <- unlist(out[, "beta[2]"])
beta_nref <- unlist(out[, "beta[1]"])
beta_score <- unlist(out[, "beta[3]"])
beta_nrounds <- unlist(out[, "beta[4]"])


##--------JAGS: Convergence cheking-----
plot(Intercept, type="l")
plot(beta_stat_rev, type="l")
plot(beta_nref, type="l")
plot(beta_score, type="l")
plot(beta_nrounds, type="l")

# Figure 4

#Top left
data_plot <- data.frame(rounds = 1:6, m_final= NA, q1_final=NA, q3_final=NA)
for(i in 1:dim(data_plot)[1]){
  sims <- invlogit(Intercept + beta_stat_rev*median(datos_mod$max_stats_rev)+ beta_nref*median(datos_mod$num_ref) + beta_nrounds*data_plot$rounds[i]+beta_score*median(datos_mod$score_all))
  data_plot$m_final[i] <- mean(sims)
  data_plot$q1_final[i] <- quantile(sims,0.025)
  data_plot$q3_final[i] <- quantile(sims, 0.975)
}

ggplot(data_plot, aes(x=rounds, y=m_final))+ geom_line() + geom_ribbon(data=data_plot, aes(ymin=q1_final, ymax=q3_final), alpha=0.3) +theme_ipsum()+ xlab("") +ylim(c(0,1))+ ylab("") 

#Top right
data_plot <- data.frame(score = seq(0,1,length.out = 30), m_final= NA, q1_final=NA, q3_final=NA)
for(i in 1:dim(data_plot)[1]){
  sims <- invlogit(Intercept + beta_stat_rev*median(datos_mod$max_stats_rev)+ beta_nref*median(datos_mod$num_ref) + beta_nrounds*2 + beta_score*data_plot$score[i])
  data_plot$m_final[i] <- mean(sims)
  data_plot$q1_final[i] <- quantile(sims,0.025)
  data_plot$q3_final[i] <- quantile(sims, 0.975)
}

ggplot(data_plot, aes(x=score, y=m_final))+ geom_line()+ geom_ribbon(data=data_plot, aes(ymin=q1_final, ymax=q3_final), alpha=0.3)+ylim(0,1) +theme_ipsum() + xlab("") + ylab("")

#Bottom left
data_plot <- data.frame(nref = 1:4, m_final= NA, q1_final=NA, q3_final=NA)
for(i in 1:dim(data_plot)[1]){
  sims <- invlogit(Intercept + beta_stat_rev*median(datos_mod$max_stats_rev)+ beta_nref*data_plot$nref[i] + beta_nrounds*2 + beta_score*0.33)
  data_plot$m_final[i] <- mean(sims)
  data_plot$q1_final[i] <- quantile(sims,0.025)
  data_plot$q3_final[i] <- quantile(sims, 0.975)
}

ggplot(data_plot, aes(x=nref, y=m_final))+ geom_line() + geom_ribbon(data=data_plot, aes(ymin=q1_final, ymax=q3_final), alpha=0.3)+ylim(0,1) +theme_ipsum() + xlab("") + ylab("")

#Bottom right
data_plot <- data.frame(review = 0:20, m_final= NA, q1_final=NA, q3_final=NA)
for(i in 1:dim(data_plot)[1]){
  sims <- invlogit(Intercept + beta_stat_rev*data_plot$review[i]+ beta_nref*median(datos_mod$num_ref) + beta_nrounds*2 +beta_score*median(datos_mod$score_all))
  data_plot$m_final[i] <- mean(sims)
  data_plot$q1_final[i] <- quantile(sims,0.025)
  data_plot$q3_final[i] <- quantile(sims, 0.975)
}


ggplot(data_plot, aes(x=review, y=m_final))+ geom_line() + geom_ribbon(data=data_plot, aes(ymin=q1_final, ymax=q3_final), alpha=0.3) +theme_ipsum()+ xlab("") + ylim(c(0,1))+ ylab("") 


# Figure 13
data_stat_rev <- data.frame(sims = exp(unlist(out[,"beta[1]"])))
p1<- ggplot(data_stat_rev, aes(x=sims))+geom_density()+ xlab("")+ylab("")+ geom_vline(xintercept = 1,lty="dashed") +theme_ipsum()

data_initial <- data.frame(sims = exp(unlist(out[,"beta[2]"])))
p2<-ggplot(data_initial, aes(x=sims))+geom_density()+ xlab("")+ylab("")+theme_ipsum() + geom_vline(xintercept = 1, lty="dashed")

data_score <- data.frame(sims = exp(unlist(out[,"beta[3]"])))
p3 <- ggplot(data_score, aes(x=sims))+geom_density()+ xlab("")+ylab("")+theme_ipsum()+ geom_vline(xintercept = 1,lty="dashed")

data_score <- data.frame(sims = exp(unlist(out[,"beta[4]"])))
p4 <- ggplot(data_score, aes(x=sims))+geom_density()+ xlab("")+ylab("")+theme_ipsum()+ geom_vline(xintercept = 1,lty="dashed")


p <- grid.arrange(p1,p2,p3,p4,nrow=2)
